#!/usr/bin/perl
use strict;

# Build url.html from home.html with a shell pipeline:
#   1. keep only the lines of home.html that contain "href";
#   2. split each line on  href="  and keep the second field;
#   3. split that on  " title="  and print field 1, a space, and field 2.
# Each line of url.html therefore reads "<url> <text that followed title=\">".
my $cmd = "cat home.html | grep href  | awk -Fhref=\\\" \'{print \$2}\' | awk -F\"\\\" title=\\\"\" \'{print \$1 \" \" \$2}\'>url.html";
system($cmd) == 0
	or warn "link extraction pipeline exited with status $?\n";

# FV is the shared input handle read by step_1/step_2/step_3. Fail loudly
# here: reading from a handle that never opened would just yield zero lines
# and make every step look like a successful no-op.
open(FV, '<', 'url.html')
	or die "cannot open url.html: $!\n";

# File-scope scratch variables used by the step_* subs.
my $line;
my ($url,$name);
my ($prod_url, $prod_name);

# step_1: create the page/ directory tree and download the index pages.
#
# Reads url.html line by line through the file-level FV handle.
#   * A line matching the category pattern sets the current category and
#     creates page/<category>.
#   * A line matching the product pattern creates page/<category>/<product>
#     and, when the cleaned product name ends in "_<count>", downloads
#     "<url>?page=N" for N = 1 .. ceil(count / 24) into index_N.html.
#
# Directories are created with the mkdir built-in and wget is started with
# list-form system(), so scraped names and URLs are never parsed by a shell
# (the previous unquoted `mkdir page/$name` broke on names containing "&",
# ";" or "$"). Failures are reported with warn and processing continues.
# Takes no arguments and returns nothing.
sub step_1 {
	my $per_page = 24;    # divisor used to turn the item count into a page count

	# Create a single directory level; quiet when it already exists (reruns).
	my $ensure_dir = sub {
		my ($dir) = @_;
		return if -d $dir;
		mkdir($dir) or warn "cannot create directory $dir: $!\n";
		return;
	};

	# Turn a scraped product label into the directory name used on disk.
	my $clean_product = sub {
		my ($label) = @_;
		$label =~ s/&nbsp;//g;      # drop the entity before bare "&" is stripped
		$label =~ s/&//g;
		$label =~ s{[ /,(]}{_}g;
		$label =~ s/\)//g;
		return $label;
	};

	$ensure_dir->('page');

	# The category persists across lines: product lines belong to the most
	# recently seen category line.
	my $category = '';

	while (my $line = <FV>) {
		chomp $line;

		if ($line =~ /(.+?) Wholesale.+\>(.+?)<\!\-\-\[if IE 7\]/) {
			my $category_url = $1;
			$category = $2;
			$category =~ s{[ /]}{_}g;
			print "CategoryName=$category,URL=$category_url\n";
			$ensure_dir->("page/$category");
		}

		next unless $line =~ /(.+?) Wholesale.+\>(.+?)<\/a><\/li>/;
		my ($prod_url, $raw_name) = ($1, $2);
		my $prod_name = $clean_product->($raw_name);
		my $prod_dir  = "page/$category/$prod_name";
		print "CategoryName=$category, ProductName=$prod_name, URL=$prod_url\n";
		$ensure_dir->($prod_dir);

		# Only names ending in "_<digits>" carry an item count to paginate on.
		next unless $prod_name =~ /.+\_(\d+)$/;
		my $prod_cnt  = $1;
		my $prod_page = int(($prod_cnt - 1) / $per_page + 1);
		print "ProductName=$prod_name  ProductCnt=$prod_cnt, ProductPageCnt=$prod_page\n";

		for my $idx (1 .. $prod_page) {
			my $page_url = "$prod_url?page=$idx";
			system('wget', $page_url, '-O', "$prod_dir/index_$idx.html") == 0
				or warn "wget failed for $page_url (status $?)\n";
		}
	}

	return;
}

# step_2: download the detail page behind every link on the index pages.
#
# Walks url.html (file-level FV handle) the same way step_1 does to rebuild
# the page/<category>/<product>/index_N.html paths, then scans each index
# file for product links and saves each one as
# page/<category>/<product>/detail_N/<detail name>.html.
#
# Index files that cannot be opened are reported and skipped. The detail_N
# directory is created once, on the first link found in index_N.html, rather
# than spawning a failing mkdir for every link. wget is started with
# list-form system() so scraped URLs and names never reach a shell.
# Takes no arguments and returns nothing.
sub step_2 {
	my $per_page = 24;    # divisor used to turn the item count into a page count

	# Create a single directory level; quiet when it already exists.
	my $ensure_dir = sub {
		my ($dir) = @_;
		return if -d $dir;
		mkdir($dir) or warn "cannot create directory $dir: $!\n";
		return;
	};

	# Shared clean-up for product and detail labels (identical rules).
	my $clean_label = sub {
		my ($label) = @_;
		$label =~ s/&nbsp;//g;      # drop the entity before bare "&" is stripped
		$label =~ s/&//g;
		$label =~ s{[ /,(]}{_}g;
		$label =~ s/\)//g;
		return $label;
	};

	$ensure_dir->('page');

	# Product lines belong to the most recently seen category line.
	my $category = '';

	while (my $line = <FV>) {
		chomp $line;

		if ($line =~ /(.+?) Wholesale.+\>(.+?)<\!\-\-\[if IE 7\]/) {
			$category = $2;
			$category =~ s{[ /]}{_}g;
		}

		next unless $line =~ /(.+?) Wholesale.+\>(.+?)<\/a><\/li>/;
		my $raw_name  = $2;
		my $prod_name = $clean_label->($raw_name);

		# Only names ending in "_<digits>" carry an item count to paginate on.
		next unless $prod_name =~ /.+\_(\d+)$/;
		my $prod_cnt  = $1;
		my $prod_page = int(($prod_cnt - 1) / $per_page + 1);
		my $prod_dir  = "page/$category/$prod_name";

		for my $idx (1 .. $prod_page) {
			my $index_file = "./$prod_dir/index_$idx.html";
			my $detail_dir = "$prod_dir/detail_$idx";

			my $index_fh;
			unless (open($index_fh, '<', $index_file)) {
				warn "cannot open $index_file: $!\n";
				next;
			}

			while (my $index_content = <$index_fh>) {
				chomp $index_content;
				next unless $index_content =~ /<a href="(.+?)" class="ih".+?" alt="Wholesale (.+?)" title=" Wholesale /;
				my ($detail_page, $raw_detail) = ($1, $2);
				my $detail_name = $clean_label->($raw_detail);
				print "$detail_page, $detail_name\n";

				$ensure_dir->($detail_dir);
				system('wget', $detail_page, '-O', "$detail_dir/$detail_name.html") == 0
					or warn "wget failed for $detail_page (status $?)\n";
			}

			close($index_fh);
		}
	}

	return;
}

# step_3: generate windows_down.cmd, a batch file that downloads the images.
#
# Walks url.html (file-level FV handle), the index pages and the detail pages
# saved by the earlier steps. For every detail page it writes a
# `mkdir "image\<category>\<product>\detail_N\<detail name>"` line, and for
# every http://www.otterpart.com/images/s/<name>.jpg URL found in that page a
# `wget` line that fetches the matching /images/v/<name>.jpg into that
# directory (slashes in <name> become underscores in the local file name).
#
# Index or detail files that cannot be opened are reported and skipped. The
# sub dies if windows_down.cmd cannot be written, since there would be no
# output at all. Takes no arguments and returns nothing.
sub step_3 {
	my $per_page = 24;    # divisor used to turn the item count into a page count

	# Shared clean-up for product and detail labels (identical rules).
	my $clean_label = sub {
		my ($label) = @_;
		$label =~ s/&nbsp;//g;      # drop the entity before bare "&" is stripped
		$label =~ s/&//g;
		$label =~ s{[ /,(]}{_}g;
		$label =~ s/\)//g;
		return $label;
	};

	unless (-d 'image') {
		mkdir('image') or warn "cannot create directory image: $!\n";
	}

	open(my $cmd_fh, '>', 'windows_down.cmd')
		or die "cannot write windows_down.cmd: $!\n";
	print {$cmd_fh} "mkdir image\n";

	# Product lines belong to the most recently seen category line.
	my $category = '';

	while (my $line = <FV>) {
		chomp $line;

		if ($line =~ /(.+?) Wholesale.+\>(.+?)<\!\-\-\[if IE 7\]/) {
			$category = $2;
			$category =~ s{[ /]}{_}g;
		}

		next unless $line =~ /(.+?) Wholesale.+\>(.+?)<\/a><\/li>/;
		my $raw_name  = $2;
		my $prod_name = $clean_label->($raw_name);

		# Only names ending in "_<digits>" carry an item count to paginate on.
		next unless $prod_name =~ /.+\_(\d+)$/;
		my $prod_cnt  = $1;
		my $prod_page = int(($prod_cnt - 1) / $per_page + 1);

		for my $idx (1 .. $prod_page) {
			my $index_file = "./page/$category/$prod_name/index_$idx.html";

			my $index_fh;
			unless (open($index_fh, '<', $index_file)) {
				warn "cannot open $index_file: $!\n";
				next;
			}

			while (my $index_content = <$index_fh>) {
				chomp $index_content;
				next unless $index_content =~ /<a href="(.+?)" class="ih".+?" alt="Wholesale (.+?)" title=" Wholesale /;
				my $raw_detail  = $2;
				my $detail_name = $clean_label->($raw_detail);

				# Batch-file paths use backslashes; the local page paths do not.
				my $image_dir = "image\\$category\\$prod_name\\detail_$idx\\$detail_name";
				print {$cmd_fh} "mkdir \"$image_dir\"\n";

				my $detail_file = "./page/$category/$prod_name/detail_$idx/$detail_name.html";
				my $detail_fh;
				unless (open($detail_fh, '<', $detail_file)) {
					warn "cannot open $detail_file: $!\n";
					next;
				}

				while (my $detail_content = <$detail_fh>) {
					# The dot before "jpg" is escaped so only a real ".jpg"
					# suffix ends the capture.
					while ($detail_content =~ m{http://www\.otterpart\.com/images/s/(.+?)\.jpg}g) {
						my $image_name = $1;
						(my $down_image_name = $image_name) =~ s{/}{_}g;
						print {$cmd_fh} "wget \"http://www.otterpart.com/images/v/$image_name.jpg\" -O \"$image_dir\\$down_image_name.jpg\"\n";
					}
				}

				close($detail_fh);
			}

			close($index_fh);
		}
	}

	# Buffered write errors only surface at close time.
	close($cmd_fh)
		or die "error writing windows_down.cmd: $!\n";

	return;
}

# main: run the step selected by the first command-line argument.
#
#   1 - download index pages          (step_1)
#   2 - download detail pages         (step_2)
#   3 - generate image download cmd   (step_3)
#
# With no argument the usage text is printed and the script exits 0. An
# argument that is not a known step id is reported on STDERR, the usage text
# is printed and the script exits 1 (previously it silently did nothing).
sub main {
	my %step_for = (
		1 => \&step_1,    # 1-download index page
		2 => \&step_2,    # 2-download detail page
		3 => \&step_3,    # 3-generate image download cmd
	);

	my $step_id = $ARGV[0];

	# Require a plain non-negative integer; "+ 0" keeps inputs such as "01"
	# working as they did with the old numeric comparison.
	my $known = defined $step_id
		&& $step_id =~ /\A\d+\z/
		&& exists $step_for{ $step_id + 0 };

	if (!$known) {
		warn "unknown step_id: $step_id\n" if defined $step_id;
		print "usage: ./get_html_linux.pl step_id\n";
		print "example: ./get_html_linux.pl 1\n";
		print "step_id meaning: 1-download index page, 2-download detail page, 3-generate image download cmd\n";
		exit(defined $step_id ? 1 : 0);
	}

	$step_for{ $step_id + 0 }->();
	exit(0);
}

main();

